{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "from pyspark.sql import SparkSession\n",
    "import pyspark.sql.functions as F\n",
    "import pyspark.sql.types as T"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "spark = SparkSession.builder.appName(\"HelloSpark\").master('local').getOrCreate()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = spark.read.options(header=True, inferSchema=True).csv(\"./dataframe_basics/BeijingPM20100101_20151231.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.write.json(\"output\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "ename": "AnalysisException",
     "evalue": "path file:/home/guoruiming/workspace/pyla/pyspark/output already exists.",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mAnalysisException\u001b[0m                         Traceback (most recent call last)",
      "\u001b[1;32m/home/guoruiming/workspace/pyla/pyspark/read_write.ipynb Cell 5'\u001b[0m in \u001b[0;36m<cell line: 1>\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> <a href='vscode-notebook-cell://ssh-remote%2Bmaker-jupyter-zerotier/home/guoruiming/workspace/pyla/pyspark/read_write.ipynb#ch0000004vscode-remote?line=0'>1</a>\u001b[0m df\u001b[39m.\u001b[39;49mwrite\u001b[39m.\u001b[39;49mjson(\u001b[39m\"\u001b[39;49m\u001b[39moutput\u001b[39;49m\u001b[39m\"\u001b[39;49m)\n",
      "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/pyspark/sql/readwriter.py:1095\u001b[0m, in \u001b[0;36mDataFrameWriter.json\u001b[0;34m(self, path, mode, compression, dateFormat, timestampFormat, lineSep, encoding, ignoreNullFields)\u001b[0m\n\u001b[1;32m   1086\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mmode(mode)\n\u001b[1;32m   1087\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_set_opts(\n\u001b[1;32m   1088\u001b[0m     compression\u001b[39m=\u001b[39mcompression,\n\u001b[1;32m   1089\u001b[0m     dateFormat\u001b[39m=\u001b[39mdateFormat,\n\u001b[0;32m   (...)\u001b[0m\n\u001b[1;32m   1093\u001b[0m     ignoreNullFields\u001b[39m=\u001b[39mignoreNullFields,\n\u001b[1;32m   1094\u001b[0m )\n\u001b[0;32m-> 1095\u001b[0m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_jwrite\u001b[39m.\u001b[39;49mjson(path)\n",
      "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/py4j/java_gateway.py:1321\u001b[0m, in \u001b[0;36mJavaMember.__call__\u001b[0;34m(self, *args)\u001b[0m\n\u001b[1;32m   1315\u001b[0m command \u001b[39m=\u001b[39m proto\u001b[39m.\u001b[39mCALL_COMMAND_NAME \u001b[39m+\u001b[39m\\\n\u001b[1;32m   1316\u001b[0m     \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mcommand_header \u001b[39m+\u001b[39m\\\n\u001b[1;32m   1317\u001b[0m     args_command \u001b[39m+\u001b[39m\\\n\u001b[1;32m   1318\u001b[0m     proto\u001b[39m.\u001b[39mEND_COMMAND_PART\n\u001b[1;32m   1320\u001b[0m answer \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mgateway_client\u001b[39m.\u001b[39msend_command(command)\n\u001b[0;32m-> 1321\u001b[0m return_value \u001b[39m=\u001b[39m get_return_value(\n\u001b[1;32m   1322\u001b[0m     answer, \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mgateway_client, \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mtarget_id, \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mname)\n\u001b[1;32m   1324\u001b[0m \u001b[39mfor\u001b[39;00m temp_arg \u001b[39min\u001b[39;00m temp_args:\n\u001b[1;32m   1325\u001b[0m     temp_arg\u001b[39m.\u001b[39m_detach()\n",
      "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/pyspark/sql/utils.py:196\u001b[0m, in \u001b[0;36mcapture_sql_exception.<locals>.deco\u001b[0;34m(*a, **kw)\u001b[0m\n\u001b[1;32m    192\u001b[0m converted \u001b[39m=\u001b[39m convert_exception(e\u001b[39m.\u001b[39mjava_exception)\n\u001b[1;32m    193\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mnot\u001b[39;00m \u001b[39misinstance\u001b[39m(converted, UnknownException):\n\u001b[1;32m    194\u001b[0m     \u001b[39m# Hide where the exception came from that shows a non-Pythonic\u001b[39;00m\n\u001b[1;32m    195\u001b[0m     \u001b[39m# JVM exception message.\u001b[39;00m\n\u001b[0;32m--> 196\u001b[0m     \u001b[39mraise\u001b[39;00m converted \u001b[39mfrom\u001b[39;00m \u001b[39mNone\u001b[39m\n\u001b[1;32m    197\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[1;32m    198\u001b[0m     \u001b[39mraise\u001b[39;00m\n",
      "\u001b[0;31mAnalysisException\u001b[0m: path file:/home/guoruiming/workspace/pyla/pyspark/output already exists."
     ]
    }
   ],
   "source": [
    "try:\n",
    "    df.write.json(\"output\")\n",
    "except Exception as e:\n",
    "    print(e)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.write.mode(\"append\").json(\"output\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3.10.4 ('base')",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.4"
  },
  "orig_nbformat": 4,
  "vscode": {
   "interpreter": {
    "hash": "d4d1e4263499bec80672ea0156c357c1ee493ec2b1c70f0acce89fc37c4a6abe"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
